
@InProceedings{CoelhoPratSchw:2021:GeApFa,
               author = {Coelho, Luiz Eduardo Lima and Prates, Raphael Felipe de Carvalho
                         and Schwartz, William Robson},
                title = {A Generative Approach for Face Mask Removal Using Audio and
                         Appearance},
            booktitle = {Proceedings...},
                 year = {2021},
                month = oct,
               editor = {Paiva, Afonso and Menotti, David and Baranoski, Gladimir V. G. and
                         Proen{\c{c}}a, Hugo Pedro and Junior, Antonio Lopes Apolinario
                         and Papa, Jo{\~a}o Paulo and Pagliosa, Paulo and dos Santos,
                         Thiago Oliveira and e S{\'a}, Asla Medeiros and da Silveira,
                         Thiago Lopes Trugillo and Brazil, Emilio Vital and Ponti, Moacir
                         A. and Fernandes, Leandro A. F. and Avila, Sandra},
         organization = {Conference on Graphics, Patterns and Images, 34. (SIBGRAPI)},
            publisher = {IEEE Computer Society},
              address = {Los Alamitos},
             keywords = {computer vision, GAN, image inpainting},
             abstract = {Since the COVID-19 pandemic, the use of facial masks in public
                         spaces or during people gatherings has become common. Therefore,
                         journalists, reporters, and interviewees frequently use a mask,
                         following the public health measures to contain the pandemic.
                         However, using a mask while speaking or conducting a presentation
                         can be uncomfortable for viewers. Furthermore, the usage of a mask
                         prevents lip reading, which can harm the speech comprehension of
                         people with hearing impairment. Thus, this work aims at
                         artificially removing masks in videos while recovering the lip
                         movements using the audio and uncovered face features. We use the
                         audio to infer the lip movement in a way it matches with the
                         uttered phrase. From the audio, we estimate landmarks representing
                         the mouth structure. Finally, the landmarks (i.e. uncovered and
                         estimated) are the input in a generative adversarial network (GAN)
                         that reconstructs the full face image with the mouth in a correct
                         shape. We present quantitative results in the form of evaluation
                         metrics and qualitative results in the form of visual examples.},
  conference-location = {Gramado, RS, Brazil (virtual)},
      conference-year = {18-22 Oct. 2021},
                  doi = {10.1109/SIBGRAPI54419.2021.00040},
                  url = {http://urlib.net/ibi/8JMKD3MGPEW34M/45CTJ95},
             language = {en},
                  ibi = {8JMKD3MGPEW34M/45CTJ95},
           targetfile = {SIBGRAPI_65.pdf},
        urlaccessdate = {2024, May 06},
}

